suppressPackageStartupMessages(library(tidyverse))
devtools::load_all('~/Google Drive/My Drive/Scripts/R_packages/myUtilities/')
## ℹ Loading myUtilities

Settings

# Location of the raw data volume. It is not referenced in the visible part
# of this script.
data_dir <- '/Volumes/Mitsu_NGS_3/METTL2A/'

# Root of the analysis directory. It must keep its trailing slash because
# paths below are built with paste0(), which inserts no separator.
wd <- "~/Google Drive/My Drive/Analysis/METTL2A/"
# NOTE(review): the visible code builds every path from `wd`, so changing the
# working directory may be unnecessary -- confirm before removing.
setwd(wd)

# Output directories for figures and tables produced by this script.
figdir <- paste0(wd, 'Figures/DRS_m3C_sites/Metagene_RNA/')
tabledir <- paste0(wd, 'Tables/DRS_m3C_sites/Metagene_RNA/')

# Session-wide ggplot2 default: classic theme, 7 pt base font, legend below
# the panel.
theme_set(
  theme_classic(base_size = 7) +
    theme(legend.position = 'bottom')
)

Functions

# Prefix a path with the analysis root stored in the global `wd`.
#
# path: character vector of paths relative to `wd`.
# Returns `wd` and `path` concatenated without a separator, so `wd` is
# expected to end with a slash already.
paste_wd <- function(path) {
  full_path <- paste0(wd, path)
  full_path
}

# Expand each transcript sequence into one row per base.
#
# df: data frame with columns `transcript_id` and `transcript_seq`.
# Returns `df` with `transcript_seq` replaced by a single-character `base`
# column and a numeric `position` column that counts bases from 1 within
# each `transcript_id`.
calc_base_position <- function(df) {
  df |>
    dplyr::rename(base = transcript_seq) |>
    # Splitting on the empty string yields one element per character.
    mutate(base = str_split(base, '')) |>
    unnest(base) |>
    group_by(transcript_id) |>
    # row_number() restarts at 1 in every group; convert to double so the
    # column type stays numeric rather than integer.
    mutate(position = as.numeric(row_number())) |>
    ungroup()
}

# Attach transcript annotation and compute the relative position in the RNA.
#
# df: data frame with columns `transcript_id` and `position`.
# annotation: per-transcript annotation with columns `transcript_id` and
#   `length`; any other columns are carried along. Defaults to the global
#   `DRS_methylated_RNAs_annotation`, which is looked up when the function
#   is called.
# Returns `df` with `position` renamed to `kmer_middle`, the annotation
#   columns added, and `rel_kmer_middle = kmer_middle / length`.
calc_relposition_in_RNA <- function(df,
                                    annotation = DRS_methylated_RNAs_annotation) {

  df |>
    dplyr::rename(kmer_middle = position) |>
    # Join on an explicit key instead of every shared column name, and fail
    # loudly if a transcript has more than one annotation row: a silent
    # fan-out would duplicate positions and distort the densities.
    left_join(
      annotation,
      by = join_by(transcript_id),
      relationship = 'many-to-one'
    ) |>
    mutate(rel_kmer_middle = kmer_middle / length)

}

# Locate CC dinucleotides in each transcript sequence.
#
# df: data frame with columns `transcript_id` and `transcript_seq`.
# overlapping: if FALSE (default, the original behaviour) matches are
#   non-overlapping, so a run such as "CCC" yields a single CC (bases 1-2).
#   If TRUE every C that is followed by another C is reported, so "CCC"
#   yields two CCs (bases 1-2 and 2-3).
# Returns a data frame with `transcript_id` and `position`, where `position`
#   is the midpoint of the dinucleotide (first base + 0.5).
#
# NOTE(review): with the default, CCs inside poly-C stretches are
# under-counted in the "all CC" background -- confirm this is intended.
calc_CC_position <- function(df, overlapping = FALSE) {

  stopifnot(
    is.logical(overlapping),
    length(overlapping) == 1L,
    !is.na(overlapping)
  )

  # The lookahead consumes only the first C, so the search resumes at the
  # next base and overlapping occurrences are found.
  cc_pattern <- if (overlapping) 'C(?=C)' else 'CC'

  df |>
    mutate(position = str_locate_all(transcript_seq, cc_pattern)) |>
    # One row per match; `position` becomes a two-column start/end matrix.
    unnest(position) |>
    # Column 1 is the match start. The dinucleotide midpoint is start + 0.5
    # for both patterns (equal to (start + end) / 2 for the plain 'CC' match).
    mutate(position = position[, 1] + 0.5) |>
    select(transcript_id, position)

}

# Draw and save a metagene density plot of relative positions along RNAs.
#
# Reads the global `rel_position_allC_m3C`, drops rows whose `genetype2` is
# NA, and draws one density curve per `type`, with one facet per `genetype2`.
# The plot is then passed to `ggsave_multiple_formats()` with the global
# `figdir` as output directory.
#
# adjust_value: bandwidth multiplier passed to `geom_density(adjust = )`;
#   it is also appended to the output file basename.
# Returns whatever `ggsave_multiple_formats()` returns.
plot_metagene_distribution_different_adjustment <- function(adjust_value) {

  # Name the colours so that each `type` keeps its colour regardless of how
  # the scale orders its levels. The pairing follows the sorted order the
  # unnamed vector relied on ('all CC', 'allC', 'm3C'). A `type` value not
  # listed here would be drawn in the scale's NA colour.
  type_colours <- c('all CC' = 'gray', 'allC' = 'blue', 'm3C' = 'red')

  metagene_plot <-
    rel_position_allC_m3C |>
    filter(!is.na(genetype2)) |>
    ggplot(aes(x = rel_kmer_middle, colour = type)) +
    geom_density(adjust = adjust_value) +
    facet_wrap( ~ genetype2, ncol = 1, scales = 'free') +
    scale_color_manual(values = type_colours)

  metagene_plot |>
    ggsave_multiple_formats(
      basename = paste0('metageneplot_RNAs_groupedby_type_adjust_', adjust_value),
      outdir = figdir, width = 4, height = 12, fontsize = 7
    )

}

Read data

Methylated positions

# Read the table of methylated positions (TSV) from the analysis directory.
DRS_methylated_positions <- read_tsv(
  paste_wd(
    'Tables/DRS_m3C_sites/DRS_methylated_positions_relative_range_2024-04-22.tsv'
  )
)
## Rows: 489 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): transcript_id, gene_name, seqname, gene_type, ref_kmer, genetype2
## dbl (7): kmer_start, kmer_end, kmer_middle, length, rel_kmer_start, rel_kmer...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Write a copy of the methylated-position table into `tabledir`.
export_tsv(DRS_methylated_positions, outdir = tabledir)
## 
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Tables/DRS_m3C_sites/Metagene_RNA/DRS_methylated_positions_2024-07-29.tsv
## # A tibble: 489 × 13
##    transcript_id     gene_name seqname gene_type    ref_kmer kmer_start kmer_end
##    <chr>             <chr>     <chr>   <chr>        <chr>         <dbl>    <dbl>
##  1 ENST00000429711.7 RPL32     chr3    protein_cod… GCCCA           423      427
##  2 ENST00000647248.2 RPL35A    chr3    protein_cod… ACCCC           381      385
##  3 ENST00000647248.2 RPL35A    chr3    protein_cod… CCCCT           382      386
##  4 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      CCCCG            58       62
##  5 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      ACCCT            76       80
##  6 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      ATCAA            94       98
##  7 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      GCCAC           149      153
##  8 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      ACCCC           154      158
##  9 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      CCCCC           155      159
## 10 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      CCCCA           156      160
## # ℹ 479 more rows
## # ℹ 6 more variables: kmer_middle <dbl>, genetype2 <chr>, length <dbl>,
## #   rel_kmer_start <dbl>, rel_kmer_middle <dbl>, rel_kmer_end <dbl>

Transcript sequences

# Read the transcript sequences and drop the `transcript_length` column,
# keeping `transcript_id` and `transcript_seq`.
espresso_transcript_seqs <- select(
  read_tsv(
    paste_wd(
      'Tables/Database/espresso_AsPC1_transcriptome_seqs_2024-04-22.tsv.gz'
    )
  ),
  -transcript_length
)
## Rows: 36717 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): transcript_id, transcript_seq
## dbl (1): transcript_length
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print a preview of the transcript sequence table.
espresso_transcript_seqs
## # A tibble: 36,717 × 2
##    transcript_id      transcript_seq                                            
##    <chr>              <chr>                                                     
##  1 ENST00000339437.11 AGCCCGGAAGTGCGCGTGGCGGCGGTGGCGGCTGCGGCAACAGCGGGGCCGATGTGT…
##  2 ENST00000251607.11 AGCCCGGAAGTGCGCGTGGCGGCGGTGGCGGCTGCGGCAACAGCGGGGCCGATGTGT…
##  3 ENST00000420393.5  CAGCGGGGCCGGTAAGCGGGCGCGCGCCGCTCAGAGGGGCAGAGTTGGTGGCGTGAG…
##  4 ENST00000698415.1  GATGTATGATGAGTTTAGTTGAATGCTCGTGTTGCTGTCTGCTAGCCAAAGACCAAC…
##  5 ENST00000698416.1  CATGACTAGTTTTGTGGGTAGCAATGATGTTTAAATGTCACACACTAACCTTTTTAA…
##  6 ENST00000488263.5  AGGAACTTCATCATGAAGTCTCAAGTAAACGAACATTTTATCTTTCTTGGGATTCTA…
##  7 ENST00000424814.5  GAGATCAGCAGGACGCTGCGCACAACATGGGCAACCACCTGCCGCTCCTGCCTGCAG…
##  8 ENST00000231948.9  AGACATGGCCGGCGAAGGAGATCAGCAGGACGCTGCGCACAACATGGGCAACCACCT…
##  9 ENST00000432408.6  GCCTCCTTTGCGGGTAAACAGACATGGCCGGCGAAGGAGATCAGCAGGACGCTGCGC…
## 10 ENST00000459840.5  ATGGAGGCATTTAAACTGGGACTGAGATGGGACTGAGTGATTAAATTGCTACCAGTG…
## # ℹ 36,707 more rows

List of methylated RNAs

# One row per transcript that carries at least one methylated position.
DRS_methylated_RNAs <- distinct(DRS_methylated_positions, transcript_id)
DRS_methylated_RNAs
## # A tibble: 71 × 1
##    transcript_id    
##    <chr>            
##  1 ENST00000429711.7
##  2 ENST00000647248.2
##  3 ENST00000389680.2
##  4 ENST00000361390.2
##  5 ENST00000361453.3
##  6 ENST00000387347.2
##  7 ENST00000361624.2
##  8 ENST00000361739.1
##  9 ENST00000361899.2
## 10 ENST00000361227.2
## # ℹ 61 more rows

Annotation of the methylated RNAs

# Per-transcript annotation: every column whose name starts with 'gene' or
# 'transcript', plus `length`, reduced to unique rows.
DRS_methylated_RNAs_annotation <- distinct(
  select(
    DRS_methylated_positions,
    starts_with('gene'), starts_with('transcript'), length
  )
)
DRS_methylated_RNAs_annotation
## # A tibble: 71 × 5
##    gene_name gene_type      genetype2 transcript_id     length
##    <chr>     <chr>          <chr>     <chr>              <dbl>
##  1 RPL32     protein_coding mRNA      ENST00000429711.7   2094
##  2 RPL35A    protein_coding mRNA      ENST00000647248.2   1234
##  3 MT-RNR1   Mt_rRNA        Mt_rRNA   ENST00000389680.2    954
##  4 MT-ND1    protein_coding mt-mRNA   ENST00000361390.2    956
##  5 MT-ND2    protein_coding mt-mRNA   ENST00000361453.3   1042
##  6 MT-RNR2   Mt_rRNA        Mt_rRNA   ENST00000387347.2   1559
##  7 MT-CO1    protein_coding mt-mRNA   ENST00000361624.2   1542
##  8 MT-CO2    protein_coding mt-mRNA   ENST00000361739.1    684
##  9 MT-ATP6   protein_coding mt-mRNA   ENST00000361899.2    681
## 10 MT-ND3    protein_coding mt-mRNA   ENST00000361227.2    346
## # ℹ 61 more rows

Prepare dataframe of base positions in the methylated RNAs

# Keep the sequences of the methylated RNAs only, then expand them to one
# row per base with its position.
methylated_RNAs_base_positions <- calc_base_position(
  right_join(espresso_transcript_seqs, DRS_methylated_RNAs)
)
## Joining with `by = join_by(transcript_id)`
# Print a preview of the per-base position table.
methylated_RNAs_base_positions
## # A tibble: 101,437 × 3
##    transcript_id     base  position
##    <chr>             <chr>    <dbl>
##  1 ENST00000429711.7 A            1
##  2 ENST00000429711.7 G            2
##  3 ENST00000429711.7 C            3
##  4 ENST00000429711.7 C            4
##  5 ENST00000429711.7 C            5
##  6 ENST00000429711.7 T            6
##  7 ENST00000429711.7 T            7
##  8 ENST00000429711.7 G            8
##  9 ENST00000429711.7 C            9
## 10 ENST00000429711.7 G           10
## # ℹ 101,427 more rows

Extract positions of C bases

# Keep only the rows whose base is a cytosine.
methylated_RNAs_C_positions <- filter(
  methylated_RNAs_base_positions,
  base == 'C'
)
methylated_RNAs_C_positions
## # A tibble: 24,117 × 3
##    transcript_id     base  position
##    <chr>             <chr>    <dbl>
##  1 ENST00000429711.7 C            3
##  2 ENST00000429711.7 C            4
##  3 ENST00000429711.7 C            5
##  4 ENST00000429711.7 C            9
##  5 ENST00000429711.7 C           11
##  6 ENST00000429711.7 C           13
##  7 ENST00000429711.7 C           14
##  8 ENST00000429711.7 C           16
##  9 ENST00000429711.7 C           17
## 10 ENST00000429711.7 C           20
## # ℹ 24,107 more rows

CC positions

# Locate CC dinucleotides in the methylated RNAs, convert them to relative
# positions and label the rows as the 'all CC' set.
methylated_RNAs_CC_positions <-
  right_join(espresso_transcript_seqs, DRS_methylated_RNAs) |>
  calc_CC_position() |>
  calc_relposition_in_RNA() |>
  mutate(type = 'all CC')
## Joining with `by = join_by(transcript_id)`
## Joining with `by = join_by(transcript_id)`
# Print a preview of the CC position table.
methylated_RNAs_CC_positions
## # A tibble: 5,498 × 8
##    transcript_id     kmer_middle gene_name gene_type      genetype2 length
##    <chr>                   <dbl> <chr>     <chr>          <chr>      <dbl>
##  1 ENST00000429711.7         3.5 RPL32     protein_coding mRNA        2094
##  2 ENST00000429711.7        13.5 RPL32     protein_coding mRNA        2094
##  3 ENST00000429711.7        16.5 RPL32     protein_coding mRNA        2094
##  4 ENST00000429711.7        20.5 RPL32     protein_coding mRNA        2094
##  5 ENST00000429711.7        32.5 RPL32     protein_coding mRNA        2094
##  6 ENST00000429711.7        43.5 RPL32     protein_coding mRNA        2094
##  7 ENST00000429711.7        59.5 RPL32     protein_coding mRNA        2094
##  8 ENST00000429711.7        65.5 RPL32     protein_coding mRNA        2094
##  9 ENST00000429711.7        82.5 RPL32     protein_coding mRNA        2094
## 10 ENST00000429711.7        85.5 RPL32     protein_coding mRNA        2094
## # ℹ 5,488 more rows
## # ℹ 2 more variables: rel_kmer_middle <dbl>, type <chr>

Join data

# Stack the three position sets, each labelled through `type`:
# all C bases ('allC'), CC dinucleotides ('all CC') and the methylated
# positions ('m3C').
rel_position_allC_m3C <- bind_rows(
  methylated_RNAs_C_positions |>
    calc_relposition_in_RNA() |>
    mutate(type = 'allC'),
  methylated_RNAs_CC_positions,
  DRS_methylated_positions |>
    mutate(type = 'm3C')
)
## Joining with `by = join_by(transcript_id)`
# Print a preview of the combined position table.
rel_position_allC_m3C
## # A tibble: 30,104 × 15
##    transcript_id     base  kmer_middle gene_name gene_type      genetype2 length
##    <chr>             <chr>       <dbl> <chr>     <chr>          <chr>      <dbl>
##  1 ENST00000429711.7 C               3 RPL32     protein_coding mRNA        2094
##  2 ENST00000429711.7 C               4 RPL32     protein_coding mRNA        2094
##  3 ENST00000429711.7 C               5 RPL32     protein_coding mRNA        2094
##  4 ENST00000429711.7 C               9 RPL32     protein_coding mRNA        2094
##  5 ENST00000429711.7 C              11 RPL32     protein_coding mRNA        2094
##  6 ENST00000429711.7 C              13 RPL32     protein_coding mRNA        2094
##  7 ENST00000429711.7 C              14 RPL32     protein_coding mRNA        2094
##  8 ENST00000429711.7 C              16 RPL32     protein_coding mRNA        2094
##  9 ENST00000429711.7 C              17 RPL32     protein_coding mRNA        2094
## 10 ENST00000429711.7 C              20 RPL32     protein_coding mRNA        2094
## # ℹ 30,094 more rows
## # ℹ 8 more variables: rel_kmer_middle <dbl>, type <chr>, seqname <chr>,
## #   ref_kmer <chr>, kmer_start <dbl>, kmer_end <dbl>, rel_kmer_start <dbl>,
## #   rel_kmer_end <dbl>
# Write the combined table into `tabledir` as a gzip-compressed TSV.
export_tsv(rel_position_allC_m3C, outdir = tabledir, compression = 'gz')
## 
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Tables/DRS_m3C_sites/Metagene_RNA/rel_position_allC_m3C_2024-07-29.tsv.gz
## # A tibble: 30,104 × 15
##    transcript_id     base  kmer_middle gene_name gene_type      genetype2 length
##    <chr>             <chr>       <dbl> <chr>     <chr>          <chr>      <dbl>
##  1 ENST00000429711.7 C               3 RPL32     protein_coding mRNA        2094
##  2 ENST00000429711.7 C               4 RPL32     protein_coding mRNA        2094
##  3 ENST00000429711.7 C               5 RPL32     protein_coding mRNA        2094
##  4 ENST00000429711.7 C               9 RPL32     protein_coding mRNA        2094
##  5 ENST00000429711.7 C              11 RPL32     protein_coding mRNA        2094
##  6 ENST00000429711.7 C              13 RPL32     protein_coding mRNA        2094
##  7 ENST00000429711.7 C              14 RPL32     protein_coding mRNA        2094
##  8 ENST00000429711.7 C              16 RPL32     protein_coding mRNA        2094
##  9 ENST00000429711.7 C              17 RPL32     protein_coding mRNA        2094
## 10 ENST00000429711.7 C              20 RPL32     protein_coding mRNA        2094
## # ℹ 30,094 more rows
## # ℹ 8 more variables: rel_kmer_middle <dbl>, type <chr>, seqname <chr>,
## #   ref_kmer <chr>, kmer_start <dbl>, kmer_end <dbl>, rel_kmer_start <dbl>,
## #   rel_kmer_end <dbl>

Number of sites per type and gene type

# Number of rows for every combination of `type` and `genetype2`.
count(rel_position_allC_m3C, type, genetype2)
## # A tibble: 12 × 3
##    type   genetype2     n
##    <chr>  <chr>     <int>
##  1 all CC Mt_rRNA     156
##  2 all CC mRNA       4608
##  3 all CC mt-mRNA     713
##  4 all CC <NA>         21
##  5 allC   Mt_rRNA     650
##  6 allC   mRNA      20548
##  7 allC   mt-mRNA    2815
##  8 allC   <NA>        104
##  9 m3C    Mt_rRNA      47
## 10 m3C    mRNA        257
## 11 m3C    mt-mRNA     182
## 12 m3C    <NA>          3

Plot

# Draw and save the metagene plot once per density bandwidth multiplier.
map(
  c(1/10, 1/5, 1/2, 1, 2, 5, 10),
  plot_metagene_distribution_different_adjustment
)
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]